Pilot data: Parse Biosciences, 4 iPSC lines differentiated to NPCs, 3 batches
Load libraries
library(CelltypeR)
Warning: replacing previous import ‘data.table::last’ by ‘dplyr::last’ when loading ‘CelltypeR’
Warning: replacing previous import ‘data.table::first’ by ‘dplyr::first’ when loading ‘CelltypeR’
Warning: replacing previous import ‘data.table::between’ by ‘dplyr::between’ when loading ‘CelltypeR’
Warning: replacing previous import ‘dplyr::filter’ by ‘flowCore::filter’ when loading ‘CelltypeR’
Warning: replacing previous import ‘ggplot2::margin’ by ‘randomForest::margin’ when loading ‘CelltypeR’
Warning: replacing previous import ‘dplyr::combine’ by ‘randomForest::combine’ when loading ‘CelltypeR’
Warning: replacing previous import ‘flowCore::filter’ by ‘dplyr::filter’ when loading ‘CelltypeR’
Warning: replacing previous import ‘flowViz::contour’ by ‘graphics::contour’ when loading ‘flowStats’
Attaching package: ‘CelltypeR’
The following object is masked from ‘package:ggplot2’:
annotate
seu
An object of class Seurat
58395 features across 23769 samples within 1 assay
Active assay: RNA (58395 features, 2500 variable features)
2 dimensional reductions calculated: pca, umap
# Inspect per-sample QC metrics before integration
Idents(seu) <- 'sample'
# genes detected per cell
VlnPlot(seu, pt.size = 0.001, features = c("nFeature_RNA"))
# total UMI counts per cell
VlnPlot(seu, pt.size = 0.001, features = "nCount_RNA")
# percent mitochondrial reads per cell
VlnPlot(seu, pt.size = 0.001, features = "percent.mt")
Add in labels for batch and disease status and line
#library("CelltypeR")
# CelltypeR is a library I (Rhalena) made for flow cytometry, but it operates
# on Seurat objects and includes a quick add-annotations function.
# Work on a copy so the original object stays untouched
Seurat_Parse12sample <- seu
# here is the function
# Copy per-level annotations into a new metadata column of a Seurat object.
#
# seu             : a Seurat object.
# annotations     : character vector of new labels, one per level of `to_label`,
#                   in the SAME ORDER as levels(seu) after Idents is set —
#                   the mapping is positional, so order matters.
# to_label        : name of the existing metadata column to map from.
# annotation_name : name of the new metadata column to create.
#
# Returns the Seurat object with the added metadata column.
annotate <- function(seu, annotations, to_label, annotation_name = "CellType"){
  Idents(seu) <- to_label
  # RenameIdents expects a named vector: names = current idents, values = new labels
  names(annotations) <- levels(seu)
  seu <- RenameIdents(seu, annotations)
  seu <- AddMetaData(object = seu, metadata = Idents(seu), col.name = annotation_name)
  # explicit return; the original relied on the invisible value of the last assignment
  return(seu)
}
Idents(Seurat_Parse12sample) <- "sample"
# levels() on a Seurat object returns the current ident levels;
# annotate() maps annotations positionally onto this order
sample.levels <- levels(Seurat_Parse12sample)
# should give the order of sample
# test: relabel with the same values — the new column should match $sample exactly
Seurat_Parse12sample <- annotate(Seurat_Parse12sample, annotations = sample.levels, to_label = "sample",annotation_name = "sample.test")
table(Seurat_Parse12sample$sample.test)
x2965B3 x2965B1 TD07B3 x3448B1 x3448B2 TD22B3 TD07B2 x3448B3 x2965B2
1918 2353 1690 2400 2194 2894 2124 1530 1686
TD07B1 TD22B2 TD22B1
1496 2186 1298
table(Seurat_Parse12sample$sample)
TD07B1 TD07B2 TD07B3 TD22B1 TD22B2 TD22B3 x2965B1 x2965B2 x2965B3
1496 2124 1690 1298 2186 2894 2353 1686 1918
x3448B1 x3448B2 x3448B3
2400 2194 1530
# these match
# input vector we got from the seurat object
# Derive the batch tag (B1/B2/B3) for every sample name: first drop the
# leading alphabetic prefix, then keep the final "B" plus its batch number.
pattern <- "^[A-Za-z]+"
sample.levels.new <- gsub(pattern, "", sample.levels)
# the pattern can only match once per name, so sub() suffices here
batch <- sub(".*B", "B", sample.levels.new)
# Store the batch labels in a new "Batch" metadata column
Seurat_Parse12sample <- annotate(Seurat_Parse12sample,
                                 annotations = batch,
                                 to_label = "sample",
                                 annotation_name = "Batch")
table(Seurat_Parse12sample$Batch)
B3 B1 B2
8032 7547 8190
table(Seurat_Parse12sample$Batch,Seurat_Parse12sample$sample)
TD07B1 TD07B2 TD07B3 TD22B1 TD22B2 TD22B3 x2965B1 x2965B2 x2965B3
B3 0 0 1690 0 0 2894 0 0 1918
B1 1496 0 0 1298 0 0 2353 0 0
B2 0 2124 0 0 2186 0 0 1686 0
x3448B1 x3448B2 x3448B3
B3 0 0 1530
B1 2400 0 0
B2 0 2194 0
# add the cell line name
# sample vector is still the input vector
# Strip the trailing batch suffix (B1/B2/B3) from each sample name to get the
# cell-line name. The original applied the same anchored gsub twice; one pass
# is sufficient (the second pass was a no-op on these names), so it is dropped.
pattern <- "B[1-3]$"
sample.levels.new <- gsub(pattern, "", sample.levels)
# the stripped names ARE the cell-line names
ipscline <- sample.levels.new
ipscline
[1] "x2965" "x2965" "TD07" "x3448" "x3448" "TD22" "TD07" "x3448" "x2965"
[10] "TD07" "TD22" "TD22"
# Add the cell-line labels; ipscline must be in the same order as levels of "sample"
Seurat_Parse12sample <- annotate(Seurat_Parse12sample, annotations = ipscline, to_label = "sample",annotation_name = "IPSC_Line")
table(Seurat_Parse12sample$IPSC_Line)
x2965 TD07 x3448 TD22
5957 5310 6124 6378
table(Seurat_Parse12sample$IPSC_Line,Seurat_Parse12sample$sample)
TD07B1 TD07B2 TD07B3 TD22B1 TD22B2 TD22B3 x2965B1 x2965B2 x2965B3
x2965 0 0 0 0 0 0 2353 1686 1918
TD07 1496 2124 1690 0 0 0 0 0 0
x3448 0 0 0 0 0 0 0 0 0
TD22 0 0 0 1298 2186 2894 0 0 0
x3448B1 x3448B2 x3448B3
x2965 0 0 0
TD07 0 0 0
x3448 2400 2194 1530
TD22 0 0 0
# add disease status
# we need to know the order of the lines
Idents(Seurat_Parse12sample) <- "IPSC_Line"
# current ident levels give the line order that the status vector must follow
line.levels <- levels(Seurat_Parse12sample)
line.levels
[1] "x2965" "TD07" "x3448" "TD22"
# Map each line to disease status by NAME rather than by position, so the
# assignment cannot silently mismatch if the level order ever changes.
# TD07 and x2965 are PD lines; TD22 and x3448 are control lines.
PDstatus <- ifelse(line.levels %in% c("TD07", "x2965"), "PD", "Con")
Seurat_Parse12sample <- annotate(Seurat_Parse12sample, annotations = PDstatus, to_label = "IPSC_Line",annotation_name = "DiseaseStatus")
table(Seurat_Parse12sample$DiseaseStatus)
PD Con
11267 12502
table(Seurat_Parse12sample$DiseaseStatus,Seurat_Parse12sample$IPSC_Line)
x2965 TD07 x3448 TD22
PD 5957 5310 0 0
Con 0 0 6124 6378
table(Seurat_Parse12sample$Batch,Seurat_Parse12sample$IPSC_Line)
x2965 TD07 x3448 TD22
B3 1918 1690 1530 2894
B1 2353 1496 2400 1298
B2 1686 2124 2194 2186
Save info
# Persist the annotated object (sample / Batch / IPSC_Line / DiseaseStatus metadata)
saveRDS(Seurat_Parse12sample, "Parse12sample4lines3batchJuly7.RDS")
Align the cell lines and batches, we will align across the 12 samples
# make a list of seurat objects, one per sample
sublist <- SplitObject(seu, split.by = "sample")
# normalize and find variable features for each sample independently
# (seq_along instead of 1:length(), which misbehaves on empty lists)
for (i in seq_along(sublist)){
  sublist[[i]] <- NormalizeData(sublist[[i]], verbose = FALSE)
  sublist[[i]] <- FindVariableFeatures(sublist[[i]], selection.method = "vst")
}
Calculating gene variances
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating feature variances of standardized and clipped values
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating gene variances
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating feature variances of standardized and clipped values
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating gene variances
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating feature variances of standardized and clipped values
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating gene variances
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating feature variances of standardized and clipped values
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating gene variances
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating feature variances of standardized and clipped values
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating gene variances
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating feature variances of standardized and clipped values
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating gene variances
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating feature variances of standardized and clipped values
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating gene variances
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating feature variances of standardized and clipped values
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating gene variances
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating feature variances of standardized and clipped values
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating gene variances
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating feature variances of standardized and clipped values
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating gene variances
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating feature variances of standardized and clipped values
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating gene variances
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating feature variances of standardized and clipped values
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
# Create a placeholder for the integrated data, starting from the first object
# in the list (this is fully overwritten by IntegrateData later)
integrated_seurat <- subset(sublist[[1]])
# Tag each object's orig.ident with its sample name from the list
# (seq_along instead of 1:length(), which misbehaves on empty lists)
for (i in seq_along(sublist)) {
  sublist[[i]]$orig.ident <- names(sublist)[i]
}
sample.list <- sublist
# Per-sample preprocessing for RPCA integration:
# normalize, scale, pick variable features, and run PCA on those features.
# NOTE(review): NormalizeData/FindVariableFeatures were already run on sublist
# above; re-running here is redundant but harmless.
for (i in seq_along(sample.list)) {
  sample.list[[i]] <- NormalizeData(sample.list[[i]], verbose = FALSE)
  sample.list[[i]] <- ScaleData(sample.list[[i]], verbose = FALSE)
  sample.list[[i]] <- FindVariableFeatures(sample.list[[i]], selection.method = "vst")
  # Get the variable features for this sample
  variable_features <- VariableFeatures(sample.list[[i]])
  # RPCA projects each dataset onto the others' PCA spaces, so 30 PCs are
  # computed per sample to match dims = 1:30 used downstream
  sample.list[[i]] <- RunPCA(sample.list[[i]], verbose = FALSE, npcs = 30, features = variable_features)
}
Calculating gene variances
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating feature variances of standardized and clipped values
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating gene variances
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating feature variances of standardized and clipped values
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating gene variances
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating feature variances of standardized and clipped values
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating gene variances
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating feature variances of standardized and clipped values
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating gene variances
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating feature variances of standardized and clipped values
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating gene variances
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating feature variances of standardized and clipped values
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating gene variances
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating feature variances of standardized and clipped values
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating gene variances
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating feature variances of standardized and clipped values
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating gene variances
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating feature variances of standardized and clipped values
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating gene variances
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating feature variances of standardized and clipped values
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating gene variances
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating feature variances of standardized and clipped values
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating gene variances
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating feature variances of standardized and clipped values
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
# Find integration anchors with reciprocal PCA (faster, more conservative than CCA)
int.anchors <- FindIntegrationAnchors(object.list = sample.list, dims = 1:30, reduction = "rpca")
Computing 2000 integration features
Scaling features for provided objects
| | 0 % ~calculating
|+++++ | 8 % ~05s
|+++++++++ | 17% ~04s
|+++++++++++++ | 25% ~03s
|+++++++++++++++++ | 33% ~03s
|+++++++++++++++++++++ | 42% ~02s
|+++++++++++++++++++++++++ | 50% ~02s
|++++++++++++++++++++++++++++++ | 58% ~02s
|++++++++++++++++++++++++++++++++++ | 67% ~01s
|++++++++++++++++++++++++++++++++++++++ | 75% ~01s
|++++++++++++++++++++++++++++++++++++++++++ | 83% ~01s
|++++++++++++++++++++++++++++++++++++++++++++++ | 92% ~00s
|++++++++++++++++++++++++++++++++++++++++++++++++++| 100% elapsed=04s
Computing within dataset neighborhoods
| | 0 % ~calculating
|+++++ | 8 % ~07s
|+++++++++ | 17% ~06s
|+++++++++++++ | 25% ~05s
|+++++++++++++++++ | 33% ~04s
|+++++++++++++++++++++ | 42% ~04s
|+++++++++++++++++++++++++ | 50% ~03s
|++++++++++++++++++++++++++++++ | 58% ~03s
|++++++++++++++++++++++++++++++++++ | 67% ~02s
|++++++++++++++++++++++++++++++++++++++ | 75% ~02s
|++++++++++++++++++++++++++++++++++++++++++ | 83% ~01s
|++++++++++++++++++++++++++++++++++++++++++++++ | 92% ~01s
|++++++++++++++++++++++++++++++++++++++++++++++++++| 100% elapsed=06s
Finding all pairwise anchors
| | 0 % ~calculating
Projecting new data onto SVD
Projecting new data onto SVD
Finding neighborhoods
Finding anchors
Found 2868 anchors
|+ | 2 % ~03m 20s
Projecting new data onto SVD
Projecting new data onto SVD
Finding neighborhoods
Finding anchors
Found 545 anchors
|++ | 3 % ~03m 01s
Projecting new data onto SVD
Projecting new data onto SVD
Finding neighborhoods
Finding anchors
Found 533 anchors
|+++ | 5 % ~02m 54s
Projecting new data onto SVD
Projecting new data onto SVD
Finding neighborhoods
Finding anchors
Found 558 anchors
|++++ | 6 % ~02m 53s
Projecting new data onto SVD
Projecting new data onto SVD
Finding neighborhoods
Finding anchors
Found 549 anchors
|++++ | 8 % ~02m 56s
Projecting new data onto SVD
Projecting new data onto SVD
Finding neighborhoods
Finding anchors
Found 424 anchors
|+++++ | 9 % ~02m 50s
Projecting new data onto SVD
Projecting new data onto SVD
Finding neighborhoods
Finding anchors
Found 467 anchors
|++++++ | 11% ~02m 46s
Projecting new data onto SVD
Projecting new data onto SVD
Finding neighborhoods
Finding anchors
Found 427 anchors
|+++++++ | 12% ~02m 45s
Projecting new data onto SVD
Projecting new data onto SVD
Finding neighborhoods
Finding anchors
Found 340 anchors
|+++++++ | 14% ~02m 41s
Projecting new data onto SVD
Projecting new data onto SVD
Finding neighborhoods
Finding anchors
Found 573 anchors
|++++++++ | 15% ~02m 38s
Projecting new data onto SVD
Projecting new data onto SVD
Finding neighborhoods
Finding anchors
Found 915 anchors
|+++++++++ | 17% ~02m 38s
Projecting new data onto SVD
Projecting new data onto SVD
Finding neighborhoods
Finding anchors
Found 912 anchors
|++++++++++ | 18% ~02m 37s
Projecting new data onto SVD
Projecting new data onto SVD
Finding neighborhoods
Finding anchors
Found 519 anchors
|++++++++++ | 20% ~02m 34s
Projecting new data onto SVD
Projecting new data onto SVD
Finding neighborhoods
Finding anchors
Found 766 anchors
|+++++++++++ | 21% ~02m 33s
Projecting new data onto SVD
Projecting new data onto SVD
Finding neighborhoods
Finding anchors
Found 738 anchors
|++++++++++++ | 23% ~02m 31s
Projecting new data onto SVD
Projecting new data onto SVD
Finding neighborhoods
Finding anchors
Found 531 anchors
|+++++++++++++ | 24% ~02m 27s
Projecting new data onto SVD
Projecting new data onto SVD
Finding neighborhoods
Finding anchors
Found 505 anchors
|+++++++++++++ | 26% ~02m 24s
Projecting new data onto SVD
Projecting new data onto SVD
Finding neighborhoods
Finding anchors
Found 464 anchors
|++++++++++++++ | 27% ~02m 20s
Projecting new data onto SVD
Projecting new data onto SVD
Finding neighborhoods
Finding anchors
Found 518 anchors
|+++++++++++++++ | 29% ~02m 17s
Projecting new data onto SVD
Projecting new data onto SVD
Finding neighborhoods
Finding anchors
Found 782 anchors
|++++++++++++++++ | 30% ~02m 14s
Projecting new data onto SVD
Projecting new data onto SVD
Finding neighborhoods
Finding anchors
Found 892 anchors
|++++++++++++++++ | 32% ~02m 12s
Projecting new data onto SVD
Projecting new data onto SVD
Finding neighborhoods
Finding anchors
Found 549 anchors
|+++++++++++++++++ | 33% ~02m 09s
Projecting new data onto SVD
Projecting new data onto SVD
Finding neighborhoods
Finding anchors
Found 493 anchors
|++++++++++++++++++ | 35% ~02m 05s
Projecting new data onto SVD
Projecting new data onto SVD
Finding neighborhoods
Finding anchors
Found 432 anchors
|+++++++++++++++++++ | 36% ~02m 01s
Projecting new data onto SVD
Projecting new data onto SVD
Finding neighborhoods
Finding anchors
Found 2894 anchors
|+++++++++++++++++++ | 38% ~01m 58s
Projecting new data onto SVD
Projecting new data onto SVD
Finding neighborhoods
Finding anchors
Found 569 anchors
|++++++++++++++++++++ | 39% ~01m 55s
Projecting new data onto SVD
Projecting new data onto SVD
Finding neighborhoods
Finding anchors
Found 713 anchors
|+++++++++++++++++++++ | 41% ~01m 52s
Projecting new data onto SVD
Projecting new data onto SVD
Finding neighborhoods
Finding anchors
Found 494 anchors
|++++++++++++++++++++++ | 42% ~01m 49s
Projecting new data onto SVD
Projecting new data onto SVD
Finding neighborhoods
Finding anchors
Found 798 anchors
|++++++++++++++++++++++ | 44% ~01m 46s
Projecting new data onto SVD
Projecting new data onto SVD
Finding neighborhoods
Finding anchors
Found 724 anchors
|+++++++++++++++++++++++ | 45% ~01m 43s
Projecting new data onto SVD
Projecting new data onto SVD
Finding neighborhoods
Finding anchors
Found 398 anchors
|++++++++++++++++++++++++ | 47% ~01m 39s
Projecting new data onto SVD
Projecting new data onto SVD
Finding neighborhoods
Finding anchors
Found 456 anchors
|+++++++++++++++++++++++++ | 48% ~01m 37s
Projecting new data onto SVD
Projecting new data onto SVD
Finding neighborhoods
Finding anchors
Found 415 anchors
|+++++++++++++++++++++++++ | 50% ~01m 34s
Projecting new data onto SVD
Projecting new data onto SVD
Finding neighborhoods
Finding anchors
Found 931 anchors
|++++++++++++++++++++++++++ | 52% ~01m 31s
Projecting new data onto SVD
Projecting new data onto SVD
Finding neighborhoods
Finding anchors
Found 570 anchors
|+++++++++++++++++++++++++++ | 53% ~01m 28s
Projecting new data onto SVD
Projecting new data onto SVD
Finding neighborhoods
Finding anchors
Found 419 anchors
|++++++++++++++++++++++++++++ | 55% ~01m 25s
Projecting new data onto SVD
Projecting new data onto SVD
Finding neighborhoods
Finding anchors
Found 553 anchors
|+++++++++++++++++++++++++++++ | 56% ~01m 22s
Projecting new data onto SVD
Projecting new data onto SVD
Finding neighborhoods
Finding anchors
Found 573 anchors
|+++++++++++++++++++++++++++++ | 58% ~01m 19s
Projecting new data onto SVD
Projecting new data onto SVD
Finding neighborhoods
Finding anchors
Found 2037 anchors
|++++++++++++++++++++++++++++++ | 59% ~01m 16s
Projecting new data onto SVD
Projecting new data onto SVD
Finding neighborhoods
Finding anchors
Found 409 anchors
|+++++++++++++++++++++++++++++++ | 61% ~01m 13s
Projecting new data onto SVD
Projecting new data onto SVD
Finding neighborhoods
Finding anchors
Found 413 anchors
|++++++++++++++++++++++++++++++++ | 62% ~01m 10s
Projecting new data onto SVD
Projecting new data onto SVD
Finding neighborhoods
Finding anchors
Found 566 anchors
|++++++++++++++++++++++++++++++++ | 64% ~01m 07s
Projecting new data onto SVD
Projecting new data onto SVD
Finding neighborhoods
Finding anchors
Found 472 anchors
|+++++++++++++++++++++++++++++++++ | 65% ~01m 04s
Projecting new data onto SVD
Projecting new data onto SVD
Finding neighborhoods
Finding anchors
Found 401 anchors
|++++++++++++++++++++++++++++++++++ | 67% ~01m 01s
Projecting new data onto SVD
Projecting new data onto SVD
Finding neighborhoods
Finding anchors
Found 365 anchors
|+++++++++++++++++++++++++++++++++++ | 68% ~58s
Projecting new data onto SVD
Projecting new data onto SVD
Finding neighborhoods
Finding anchors
Found 847 anchors
|+++++++++++++++++++++++++++++++++++ | 70% ~56s
Projecting new data onto SVD
Projecting new data onto SVD
Finding neighborhoods
Finding anchors
Found 835 anchors
|++++++++++++++++++++++++++++++++++++ | 71% ~53s
Projecting new data onto SVD
Projecting new data onto SVD
Finding neighborhoods
Finding anchors
Found 475 anchors
|+++++++++++++++++++++++++++++++++++++ | 73% ~50s
Projecting new data onto SVD
Projecting new data onto SVD
Finding neighborhoods
Finding anchors
Found 772 anchors
|++++++++++++++++++++++++++++++++++++++ | 74% ~47s
Projecting new data onto SVD
Projecting new data onto SVD
Finding neighborhoods
Finding anchors
Found 740 anchors
|++++++++++++++++++++++++++++++++++++++ | 76% ~45s
Projecting new data onto SVD
Projecting new data onto SVD
Finding neighborhoods
Finding anchors
Found 3274 anchors
|+++++++++++++++++++++++++++++++++++++++ | 77% ~42s
Projecting new data onto SVD
Projecting new data onto SVD
Finding neighborhoods
Finding anchors
Found 796 anchors
|++++++++++++++++++++++++++++++++++++++++ | 79% ~39s
Projecting new data onto SVD
Projecting new data onto SVD
Finding neighborhoods
Finding anchors
Found 717 anchors
|+++++++++++++++++++++++++++++++++++++++++ | 80% ~37s
Projecting new data onto SVD
Projecting new data onto SVD
Finding neighborhoods
Finding anchors
Found 811 anchors
|+++++++++++++++++++++++++++++++++++++++++ | 82% ~34s
Projecting new data onto SVD
Projecting new data onto SVD
Finding neighborhoods
Finding anchors
Found 529 anchors
|++++++++++++++++++++++++++++++++++++++++++ | 83% ~31s
Projecting new data onto SVD
Projecting new data onto SVD
Finding neighborhoods
Finding anchors
Found 1402 anchors
|+++++++++++++++++++++++++++++++++++++++++++ | 85% ~28s
Projecting new data onto SVD
Projecting new data onto SVD
Finding neighborhoods
Finding anchors
Found 1446 anchors
|++++++++++++++++++++++++++++++++++++++++++++ | 86% ~25s
Projecting new data onto SVD
Projecting new data onto SVD
Finding neighborhoods
Finding anchors
Found 530 anchors
|++++++++++++++++++++++++++++++++++++++++++++ | 88% ~22s
Projecting new data onto SVD
Projecting new data onto SVD
Finding neighborhoods
Finding anchors
Found 582 anchors
|+++++++++++++++++++++++++++++++++++++++++++++ | 89% ~19s
Projecting new data onto SVD
Projecting new data onto SVD
Finding neighborhoods
Finding anchors
Found 443 anchors
|++++++++++++++++++++++++++++++++++++++++++++++ | 91% ~17s
Projecting new data onto SVD
Projecting new data onto SVD
Finding neighborhoods
Finding anchors
Found 1047 anchors
|+++++++++++++++++++++++++++++++++++++++++++++++ | 92% ~14s
Projecting new data onto SVD
Projecting new data onto SVD
Finding neighborhoods
Finding anchors
Found 405 anchors
|+++++++++++++++++++++++++++++++++++++++++++++++ | 94% ~11s
Projecting new data onto SVD
Projecting new data onto SVD
Finding neighborhoods
Finding anchors
Found 483 anchors
|++++++++++++++++++++++++++++++++++++++++++++++++ | 95% ~08s
Projecting new data onto SVD
Projecting new data onto SVD
Finding neighborhoods
Finding anchors
Found 595 anchors
|+++++++++++++++++++++++++++++++++++++++++++++++++ | 97% ~06s
Projecting new data onto SVD
Projecting new data onto SVD
Finding neighborhoods
Finding anchors
Found 570 anchors
|++++++++++++++++++++++++++++++++++++++++++++++++++| 98% ~03s
Projecting new data onto SVD
Projecting new data onto SVD
Finding neighborhoods
Finding anchors
Found 1021 anchors
|++++++++++++++++++++++++++++++++++++++++++++++++++| 100% elapsed=03m 01s
# Build the batch-corrected "integrated" assay from the anchors
integrated_seurat <- IntegrateData(anchorset = int.anchors, dims = 1:30)
Merging dataset 8 into 4
Extracting anchors for merged samples
Finding integration vectors
Finding integration vector weights
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Integrating data
Merging dataset 11 into 6
Extracting anchors for merged samples
Finding integration vectors
Finding integration vector weights
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Integrating data
Merging dataset 1 into 2
Extracting anchors for merged samples
Finding integration vectors
Finding integration vector weights
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Integrating data
Merging dataset 10 into 3
Extracting anchors for merged samples
Finding integration vectors
Finding integration vector weights
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Integrating data
Merging dataset 12 into 2 1
Extracting anchors for merged samples
Finding integration vectors
Finding integration vector weights
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Integrating data
Merging dataset 9 into 6 11
Extracting anchors for merged samples
Finding integration vectors
Finding integration vector weights
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Integrating data
Merging dataset 2 1 12 into 6 11 9
Extracting anchors for merged samples
Finding integration vectors
Finding integration vector weights
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Integrating data
Merging dataset 7 into 5
Extracting anchors for merged samples
Finding integration vectors
Finding integration vector weights
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Integrating data
Merging dataset 3 10 into 4 8
Extracting anchors for merged samples
Finding integration vectors
Finding integration vector weights
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Integrating data
Merging dataset 4 8 3 10 into 6 11 9 2 1 12
Extracting anchors for merged samples
Finding integration vectors
Finding integration vector weights
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Integrating data
Merging dataset 5 7 into 6 11 9 2 1 12 4 8 3 10
Extracting anchors for merged samples
Finding integration vectors
Finding integration vector weights
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Integrating data
#
# The k.weight argument of IntegrateData must not exceed the smallest sample's cell count.
# Here the smallest sample has 1530 cells, well above the default k.weight of 100, so the default is fine.
Now we need to run the workflow on the integrated object
# Run the standard workflow on the batch-corrected assay
DefaultAssay(integrated_seurat) <- "integrated"
integrated_seurat <- ScaleData(integrated_seurat, verbose = FALSE)
# only the integrated features will be the pca input
integrated_seurat <- RunPCA(integrated_seurat, npcs = 20, verbose = FALSE)
# NOTE(review): n.neighbors = 81 is well above the UMAP default of 30 —
# presumably chosen for a smoother global embedding; confirm this was intentional
integrated_seurat <- RunUMAP(integrated_seurat, reduction = "pca", dims = 1:20, n.neighbors = 81)
Have a look at the new UMAP
# UMAPs colored by each metadata column to check alignment across samples,
# batches, disease status, and cell lines after integration
DimPlot(integrated_seurat, group.by = 'sample')
DimPlot(integrated_seurat, group.by = 'Batch')
DimPlot(integrated_seurat, group.by = 'DiseaseStatus')
DimPlot(integrated_seurat, group.by = 'IPSC_Line')
NA
NA
NA
# saveRDS(integrated_seurat, "Integrated12samples.RDS")
# setwd("~/Documents/Data/scRNAseq/ParseExample/Experiment1-mini12")
# NOTE(review): "intergrated_seurat" is misspelled (vs. integrated_seurat);
# the same misspelling is reused below when copying back, so the code still
# runs, but both occurrences should be renamed together for clarity
intergrated_seurat <- readRDS("/Users/rhalenathomas/Documents/Data/scRNAseq/ParseExample/Experiment1-mini12/Integrated12samples.RDS")
Find new clusters
# Cluster at several resolutions at once; each adds an integrated_snn_res.<r> metadata column
integrated_seurat <- FindClusters(integrated_seurat, resolution = c(0,0.3,0.6,1) )
Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
Number of nodes: 23769
Number of edges: 3304295
Running Louvain algorithm...
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Maximum modularity in 10 random starts: 1.0000
Number of communities: 1
Elapsed time: 16 seconds
Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
Number of nodes: 23769
Number of edges: 3304295
Running Louvain algorithm...
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Maximum modularity in 10 random starts: 0.9097
Number of communities: 10
Elapsed time: 20 seconds
Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
Number of nodes: 23769
Number of edges: 3304295
Running Louvain algorithm...
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Maximum modularity in 10 random starts: 0.8752
Number of communities: 16
Elapsed time: 23 seconds
Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
Number of nodes: 23769
Number of edges: 3304295
Running Louvain algorithm...
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Maximum modularity in 10 random starts: 0.8382
Number of communities: 20
Elapsed time: 20 seconds
# NOTE(review): this replaces the object FindClusters just modified with the
# version reloaded from disk — the saved RDS must already contain the
# integrated_snn_res.* columns for the plots below to work; confirm the order
integrated_seurat <- intergrated_seurat
DimPlot(integrated_seurat, group.by = "integrated_snn_res.0.3")
DimPlot(integrated_seurat, group.by = "integrated_snn_res.0.6")
# Split by disease status to compare cluster composition between PD and controls
DimPlot(integrated_seurat, group.by = "integrated_snn_res.0.3", split.by = "DiseaseStatus")
table(integrated_seurat$DiseaseStatus)
Con PD
12502 11267
Annotate clusters res 0.3
# ClusterMarkers is a data.frame; base write() only handles atomic vectors,
# which caused the "argument 1 (type 'list') cannot be handled by 'cat'"
# error below — use write.csv() to serialize the marker table instead
write.csv(ClusterMarkers, "clusterMarkersres03Integrated.csv")
Error in cat(x, file = file, sep = c(rep.int(sep, ncolumns - 1), "\n"), :
argument 1 (type 'list') cannot be handled by 'cat'
library(enrichR)
Welcome to enrichR
Checking connection ...
Enrichr ... Connection is Live!
FlyEnrichr ... Connection is available!
WormEnrichr ... Connection is available!
YeastEnrichr ... Connection is available!
FishEnrichr ... Connection is available!
OxEnrichr ... Connection is available!
# Point enrichR at the main (human) gene-set server
setEnrichrSite("Enrichr") # Human genes
Connection changed to https://maayanlab.cloud/Enrichr/
Connection is Live!
# list of all the databases
# get the possible libraries
dbs <- listEnrichrDbs()
# this will list the possible libraries
dbs
# select libraries with cell types; these two names must match entries in dbs
db <- c('CellMarker_Augmented_2021','Azimuth_Cell_Types_2021')
# function for a quick look
# Quick per-cluster cell-type check using enrichR.
#
# cluster_num : cluster id to look up in the marker table.
# markers     : data frame of cluster markers with `cluster`, `avg_log2FC`,
#               and `gene` columns (defaults to the global ClusterMarkers,
#               preserving the original behavior).
# databases   : enrichR libraries to query (defaults to the global db).
#
# Prints one enrichment bar plot per library and invisibly returns the
# enrichr result list.
checkCelltypes <- function(cluster_num = 0, markers = ClusterMarkers, databases = db){
  # use dplyr::filter explicitly: flowCore (pulled in via CelltypeR) also
  # exports filter, and the load-order warnings above show the masking flips
  clusterX <- markers %>% dplyr::filter(cluster == cluster_num & avg_log2FC > 0.25)
  genes <- clusterX$gene
  # query each cell-type library for this cluster's up-regulated genes
  clusterX.cell <- enrichr(genes, databases = databases)
  # visualize the results, one plot per library, titled by library name
  print(plotEnrich(clusterX.cell[[1]], showTerms = 20, numChar = 40, y = "Count",
                   orderBy = "P.value", title = databases[1]))
  print(plotEnrich(clusterX.cell[[2]], showTerms = 20, numChar = 40, y = "Count",
                   orderBy = "P.value", title = databases[2]))
  invisible(clusterX.cell)
}
#heatmap of top markers
# top 3 genes per cluster by avg_log2FC (top_n keeps ties and does not reorder
# rows, which preserves the heatmap's gene ordering -- don't swap for slice_max)
top3 <- ClusterMarkers %>% group_by(cluster) %>% top_n(n=3, wt =avg_log2FC)
DoHeatmap(integrated_seurat, features = top3$gene, size = 3, angle = 90, group.by = "integrated_snn_res.0.3")
NA
NA
table(ClusterMarkers$cluster)
0 1 2 3 4 5 6 7 8 9
130 38 242 477 134 249 275 417 377 233
Check each cluster quickly
checkCelltypes(cluster_num = 3)
Uploading data to Enrichr... Done.
Querying CellMarker_Augmented_2021... Done.
Querying Azimuth_Cell_Types_2021... Done.
Parsing results... Done.
Look at some expression lists
# Marker gene lists for the first-pass annotation.
# Symbols corrected to current HGNC names where the originals failed to match
# any assay in the runs below: MASH1 -> ASCL1, DLG45 -> DLG4,
# GRIN3 -> GRIN3B (GRIN3A was already listed), TREM119 -> TMEM119.
da_neurons <- c("TH","SLC6A3","SLC18A2","SOX6","NDNF","SNCG","ALDH1A1","CALB1","TACR2","SLC17A6","SLC32A1","OTX2","GRP","LPL","CCK","VIP")
NPC_orStemLike <- c("DCX","NEUROD1","TBR1","PCNA","MKI67","SOX2","NES","PAX6","ASCL1")
mature_neurons <- c("RBFOX3","SYP","DLG4","VAMP1","VAMP2","TUBB3","SYT1","BSN","HOMER1","SLC17A6")
excitatory_neurons <- c("GRIA2","GRIA1","GRIA4","GRIN1","GRIN2B","GRIN2A","GRIN3A","GRIN3B","GRIP1","CAMK2A")
# 'inh' kept as an alias, and the original (misspelled) name kept for
# backward compatibility with any later chunks that use it
inh <- c("GAD1","GAD2","GAT1","PVALB","GABR2","GABR1","GBRR1","GABRB2","GABRB1","GABRB3","GABRA6","GABRA1","GABRA4","TRAK2")
inhbitory_neurons <- inh
astrocytes <- c("GFAP","S100B","AQP4","APOE","SOX9","SLC1A3")
oligodendrocytes <- c("MBP","MOG","OLIG1","OLIG2","SOX10")
# The original 'opc <-' was a dangling assignment that silently made opc an
# alias of radial_glia; give OPCs their own canonical markers instead.
opc <- c("PDGFRA","CSPG4","OLIG1","OLIG2","SOX10")
radial_glia <- c("PTPRC","AIF1","ADGRE1","VIM","TNC","PTPRZ1","FAM107A","HOPX","LIFR",
                 "ITGB5","IL6ST","SLC1A3")
epithelial <- c("HES1","HES5","SOX2","SOX10","NES","CDH1","NOTCH1")
microglia <- c("IBA1","P2RY12","P2RY13","TMEM119","GPR34","SIGLECH","TREM2",
               "CX3CR1","FCRLS","OLFML3","HEXB","TGFBR1","SALL1","MERTK",
               "PROS1")
features_list <- c("MKI67","SOX2","POU5F1","DLX2","PAX6","SOX9","HES1","NES","RBFOX3","MAP2","NCAM1","CD24","GRIA2","GRIN2B","GABBR1","GAD1","GAD2","GABRA1","GABRB2","TH","ALDH1A1","LMX1B","NR4A2","CORIN","CALB1","KCNJ6","CXCR4","ITGA6","SLC1A3","CD44","AQP4","S100B","PDGFRA","OLIG2","MBP","CLDN11","VIM","VCAM1")
short_list <- c("MKI67","SOX9","HES1","NES","DLX2","RBFOX3","MAP2","TH","CALB1","KCNJ6","SLC1A3","CD44","AQP4","S100B","OLIG2","MBP","VIM")
Idents(integrated_seurat) <- "integrated_snn_res.0.3"
# Plot each dopaminergic-neuron marker; tryCatch keeps the loop alive when a
# symbol is absent from every assay (FeaturePlot errors then, which would
# abort the remaining plots -- this happened in the other marker loops below).
for (i in da_neurons) {
  tryCatch(
    print(FeaturePlot(integrated_seurat, features = i, min.cutoff = 'q1', max.cutoff = 'q97', label = TRUE)),
    error = function(e) message("Skipping ", i, ": ", conditionMessage(e))
  )
}
Warning: Could not find TH in the default search locations, found in RNA assay instead
Warning: Could not find SLC6A3 in the default search locations, found in RNA assay instead
Warning: Could not find SLC18A2 in the default search locations, found in RNA assay instead
Warning: Could not find SNCG in the default search locations, found in RNA assay instead
Warning: Could not find ALDH1A1 in the default search locations, found in RNA assay instead
Warning: Could not find TACR2 in the default search locations, found in RNA assay instead
Warning: Could not find SLC32A1 in the default search locations, found in RNA assay instead
Warning: Could not find OTX2 in the default search locations, found in RNA assay instead
Warning: Could not find GRP in the default search locations, found in RNA assay instead
Warning: Could not find CCK in the default search locations, found in RNA assay instead
Warning: Could not find VIP in the default search locations, found in RNA assay instead
Idents(integrated_seurat) <- "integrated_snn_res.0.3"
# Plot each NPC/stem marker. The original loop aborted on MASH1 ("None of the
# requested features were found"); tryCatch skips missing symbols instead so
# the remaining markers still get plotted.
for (i in NPC_orStemLike) {
  tryCatch(
    print(FeaturePlot(integrated_seurat, features = i, min.cutoff = 'q1', max.cutoff = 'q97', label = TRUE)),
    error = function(e) message("Skipping ", i, ": ", conditionMessage(e))
  )
}
Warning: Could not find PCNA in the default search locations, found in RNA assay instead
Warning: Could not find SOX2 in the default search locations, found in RNA assay instead
Warning: Could not find NES in the default search locations, found in RNA assay instead
Warning in FetchData.Seurat(object = object, vars = c(dims, "ident", features), :
The following requested variables were not found: MASH1
Error: None of the requested features were found: MASH1 in slot data
Idents(integrated_seurat) <- "integrated_snn_res.0.3"
# Plot each astrocyte marker; tryCatch guards against missing gene symbols
# (same failure mode that aborted the NPC and mature-neuron loops).
for (i in astrocytes) {
  tryCatch(
    print(FeaturePlot(integrated_seurat, features = i, min.cutoff = 'q1', max.cutoff = 'q97', label = TRUE)),
    error = function(e) message("Skipping ", i, ": ", conditionMessage(e))
  )
}
Warning: Could not find GFAP in the default search locations, found in RNA assay instead
Warning: Could not find S100B in the default search locations, found in RNA assay instead
Warning: Could not find AQP4 in the default search locations, found in RNA assay instead
Warning: Could not find APOE in the default search locations, found in RNA assay instead
Warning: Could not find SOX9 in the default search locations, found in RNA assay instead
Idents(integrated_seurat) <- "integrated_snn_res.0.3"
# Plot each radial-glia marker; tryCatch guards against missing gene symbols
# so one bad symbol cannot abort the rest of the plots.
for (i in radial_glia) {
  tryCatch(
    print(FeaturePlot(integrated_seurat, features = i, min.cutoff = 'q1', max.cutoff = 'q97', label = TRUE)),
    error = function(e) message("Skipping ", i, ": ", conditionMessage(e))
  )
}
Warning: Could not find PTPRC in the default search locations, found in RNA assay instead
Warning: Could not find AIF1 in the default search locations, found in RNA assay instead
Warning: Could not find ADGRE1 in the default search locations, found in RNA assay instead
Warning: Could not find VIM in the default search locations, found in RNA assay instead
Warning: Could not find PTPRZ1 in the default search locations, found in RNA assay instead
Warning: Could not find FAM107A in the default search locations, found in RNA assay instead
Warning: Could not find HOPX in the default search locations, found in RNA assay instead
Warning: Could not find ITGB5 in the default search locations, found in RNA assay instead
Warning: Could not find IL6ST in the default search locations, found in RNA assay instead
Idents(integrated_seurat) <- "integrated_snn_res.0.3"
# Plot each mature-neuron marker. The original loop aborted on DLG45 ("None of
# the requested features were found"); tryCatch skips missing symbols instead.
for (i in mature_neurons) {
  tryCatch(
    print(FeaturePlot(integrated_seurat, features = i, min.cutoff = 'q1', max.cutoff = 'q97', label = TRUE)),
    error = function(e) message("Skipping ", i, ": ", conditionMessage(e))
  )
}
Warning: Could not find SYP in the default search locations, found in RNA assay instead
Warning in FetchData.Seurat(object = object, vars = c(dims, "ident", features), :
The following requested variables were not found: DLG45
Error: None of the requested features were found: DLG45 in slot data
Idents(integrated_seurat) <- "integrated_snn_res.0.3"
# Plot each excitatory-neuron marker. The original loop aborted on GRIN3
# ("None of the requested features were found"); tryCatch skips missing
# symbols instead so the remaining markers still get plotted.
for (i in excitatory_neurons) {
  tryCatch(
    print(FeaturePlot(integrated_seurat, features = i, min.cutoff = 'q1', max.cutoff = 'q97', label = TRUE)),
    error = function(e) message("Skipping ", i, ": ", conditionMessage(e))
  )
}
Warning: Could not find GRIN1 in the default search locations, found in RNA assay instead
Warning in FetchData.Seurat(object = object, vars = c(dims, "ident", features), :
The following requested variables were not found: GRIN3
Error: None of the requested features were found: GRIN3 in slot data
Add annotations - first pass NPC-stem NPC-glia NPC-SOX6 Neurons-Glut Progenitors-div NPC-SOX2-OXT-fibro Neural-Stem stem cell Neuron-GABA Neuron-epithelial
# First-pass labels for res 0.3 clusters 0-9, in cluster order (from the
# enrichment checks above): 0-2 NPC, 3 Neurons, 4-6 NPC, 7 Stem, 8 Neurons,
# 9 Epithelial
celltypes2 <- c("NPC","NPC","NPC","Neurons","NPC",
"NPC","NPC","Stem","Neurons","Epithelial")
# annotate() (defined at the top of the notebook) renames the idents and stores
# them in the meta.data column "Celltypes2"
integrated_seurat <- annotate(integrated_seurat, annotations = celltypes2, to_label = "integrated_snn_res.0.3",annotation_name = "Celltypes2")
DimPlot(integrated_seurat, label = TRUE)
DimPlot(integrated_seurat, split.by = "DiseaseStatus")
# NOTE(review): groups by "Celltypes1", which is not created in this chunk --
# it does appear in the meta.data listing below, so presumably made earlier.
DimPlot(integrated_seurat, split.by = "DiseaseStatus", group.by = "Celltypes1")
NA
NA
# Finer first-pass labels distinguishing dividing NPCs, neural stem and
# neural-epithelial clusters (same cluster order 0-9 as celltypes2)
celltypes3 <- c("NPC","NPC","NPC","Neurons","NPC-div",
"Neuro-NPC","Neural-Stem","Stem","Neurons","Neural-epi")
integrated_seurat <- annotate(integrated_seurat, annotations = celltypes3, to_label = "integrated_snn_res.0.3",annotation_name = "Celltypes3")
DimPlot(integrated_seurat, label = TRUE)
# Cells per iPSC line
table(integrated_seurat$IPSC_Line)
TD07 TD22 x2965 x3448
5310 6378 5957 6124
DEG in cell types 3 groups
DGE <- FindMarkers(seu_sub, ident.1 = "PD", ident.2 = "Con")
| | 0 % ~calculating
|+ | 1 % ~12s
|++ | 2 % ~13s
|++ | 3 % ~13s
|+++ | 4 % ~13s
|+++ | 5 % ~13s
|++++ | 6 % ~13s
|++++ | 7 % ~13s
|+++++ | 9 % ~12s
|+++++ | 10% ~12s
|++++++ | 11% ~12s
|++++++ | 12% ~12s
|+++++++ | 13% ~12s
|+++++++ | 14% ~12s
|++++++++ | 15% ~12s
|++++++++ | 16% ~12s
|+++++++++ | 17% ~12s
|++++++++++ | 18% ~11s
|++++++++++ | 19% ~11s
|+++++++++++ | 20% ~11s
|+++++++++++ | 21% ~11s
|++++++++++++ | 22% ~11s
|++++++++++++ | 23% ~11s
|+++++++++++++ | 24% ~10s
|+++++++++++++ | 26% ~10s
|++++++++++++++ | 27% ~10s
|++++++++++++++ | 28% ~10s
|+++++++++++++++ | 29% ~10s
|+++++++++++++++ | 30% ~10s
|++++++++++++++++ | 31% ~10s
|++++++++++++++++ | 32% ~09s
|+++++++++++++++++ | 33% ~09s
|++++++++++++++++++ | 34% ~09s
|++++++++++++++++++ | 35% ~09s
|+++++++++++++++++++ | 36% ~09s
|+++++++++++++++++++ | 37% ~09s
|++++++++++++++++++++ | 38% ~09s
|++++++++++++++++++++ | 39% ~08s
|+++++++++++++++++++++ | 40% ~08s
|+++++++++++++++++++++ | 41% ~08s
|++++++++++++++++++++++ | 43% ~08s
|++++++++++++++++++++++ | 44% ~08s
|+++++++++++++++++++++++ | 45% ~08s
|+++++++++++++++++++++++ | 46% ~07s
|++++++++++++++++++++++++ | 47% ~07s
|++++++++++++++++++++++++ | 48% ~07s
|+++++++++++++++++++++++++ | 49% ~07s
|+++++++++++++++++++++++++ | 50% ~07s
|++++++++++++++++++++++++++ | 51% ~07s
|+++++++++++++++++++++++++++ | 52% ~07s
|+++++++++++++++++++++++++++ | 53% ~06s
|++++++++++++++++++++++++++++ | 54% ~06s
|++++++++++++++++++++++++++++ | 55% ~06s
|+++++++++++++++++++++++++++++ | 56% ~06s
|+++++++++++++++++++++++++++++ | 57% ~06s
|++++++++++++++++++++++++++++++ | 59% ~06s
|++++++++++++++++++++++++++++++ | 60% ~05s
|+++++++++++++++++++++++++++++++ | 61% ~05s
|+++++++++++++++++++++++++++++++ | 62% ~05s
|++++++++++++++++++++++++++++++++ | 63% ~05s
|++++++++++++++++++++++++++++++++ | 64% ~05s
|+++++++++++++++++++++++++++++++++ | 65% ~05s
|+++++++++++++++++++++++++++++++++ | 66% ~05s
|++++++++++++++++++++++++++++++++++ | 67% ~04s
|+++++++++++++++++++++++++++++++++++ | 68% ~04s
|+++++++++++++++++++++++++++++++++++ | 69% ~04s
|++++++++++++++++++++++++++++++++++++ | 70% ~04s
|++++++++++++++++++++++++++++++++++++ | 71% ~04s
|+++++++++++++++++++++++++++++++++++++ | 72% ~04s
|+++++++++++++++++++++++++++++++++++++ | 73% ~04s
|++++++++++++++++++++++++++++++++++++++ | 74% ~03s
|++++++++++++++++++++++++++++++++++++++ | 76% ~03s
|+++++++++++++++++++++++++++++++++++++++ | 77% ~03s
|+++++++++++++++++++++++++++++++++++++++ | 78% ~03s
|++++++++++++++++++++++++++++++++++++++++ | 79% ~03s
|++++++++++++++++++++++++++++++++++++++++ | 80% ~03s
|+++++++++++++++++++++++++++++++++++++++++ | 81% ~03s
|+++++++++++++++++++++++++++++++++++++++++ | 82% ~02s
|++++++++++++++++++++++++++++++++++++++++++ | 83% ~02s
|+++++++++++++++++++++++++++++++++++++++++++ | 84% ~02s
|+++++++++++++++++++++++++++++++++++++++++++ | 85% ~02s
|++++++++++++++++++++++++++++++++++++++++++++ | 86% ~02s
|++++++++++++++++++++++++++++++++++++++++++++ | 87% ~02s
|+++++++++++++++++++++++++++++++++++++++++++++ | 88% ~02s
|+++++++++++++++++++++++++++++++++++++++++++++ | 89% ~01s
|++++++++++++++++++++++++++++++++++++++++++++++ | 90% ~01s
|++++++++++++++++++++++++++++++++++++++++++++++ | 91% ~01s
|+++++++++++++++++++++++++++++++++++++++++++++++ | 93% ~01s
|+++++++++++++++++++++++++++++++++++++++++++++++ | 94% ~01s
|++++++++++++++++++++++++++++++++++++++++++++++++ | 95% ~01s
|++++++++++++++++++++++++++++++++++++++++++++++++ | 96% ~01s
|+++++++++++++++++++++++++++++++++++++++++++++++++ | 97% ~00s
|+++++++++++++++++++++++++++++++++++++++++++++++++ | 98% ~00s
|++++++++++++++++++++++++++++++++++++++++++++++++++| 99% ~00s
|++++++++++++++++++++++++++++++++++++++++++++++++++| 100% elapsed=13s
DimPlot(seu_sub, group.by = "DiseaseStatus")
table(seu_sub$DiseaseStatus)
Con PD
1746 1647
Neurons DGE volcano plot
library(EnhancedVolcano)
# Volcano of the neuron-subset Wilcoxon DGE (PD vs Con): x = avg_log2FC and
# y = adjusted p value straight from the FindMarkers output above
EnhancedVolcano(DGE,
lab = rownames(DGE),
#xlim = c(-0.25,0.25),
x = 'avg_log2FC',
y = 'p_val_adj',
pCutoff = 0.000001,
FCcutoff = 1,
pointSize = 3.0,
labSize = 6.0)
NA
NA
NA
head(up)
Error in head(up) : object 'up' not found
Create a sum of counts data table
Try to separate the grouping variables in the aggregation
colnames(integrated_seurat@meta.data)
[1] "orig.ident" "nCount_RNA"
[3] "nFeature_RNA" "sample"
[5] "species" "gene_count"
[7] "tscp_count" "tscp_count_50dup"
[9] "read_count" "bc1_well"
[11] "bc2_well" "bc3_well"
[13] "bc1_wind" "bc2_wind"
[15] "bc3_wind" "percent.mt"
[17] "RNA_snn_res.0.5" "RNA_snn_res.0.6"
[19] "RNA_snn_res.0.8" "seurat_clusters"
[21] "sample.test" "Batch"
[23] "IPSC_Line" "DiseaseStatus"
[25] "integrated_snn_res.0" "integrated_snn_res.0.3"
[27] "integrated_snn_res.0.6" "integrated_snn_res.1"
[29] "Celltypes1" "Celltypes2"
[31] "Celltypes3"
# Pseudobulk: sum RNA counts per Batch x IPSC_Line x DiseaseStatus combination
# (dim below shows 12 columns = 3 batches x 4 lines; DiseaseStatus is implied
# by the line). NOTE(review): the comment below suggests the active ident may
# also be folded into the grouping -- verify the output column names.
sum_counts <- AggregateExpression(integrated_seurat, assay = "RNA", group.by = c("Batch","IPSC_Line","DiseaseStatus"))
# this seems to group by the group and the active ident together
colnames(integrated_seurat@meta.data)
[1] "orig.ident" "nCount_RNA"
[3] "nFeature_RNA" "sample"
[5] "species" "gene_count"
[7] "tscp_count" "tscp_count_50dup"
[9] "read_count" "bc1_well"
[11] "bc2_well" "bc3_well"
[13] "bc1_wind" "bc2_wind"
[15] "bc3_wind" "percent.mt"
[17] "RNA_snn_res.0.5" "RNA_snn_res.0.6"
[19] "RNA_snn_res.0.8" "seurat_clusters"
[21] "sample.test" "Batch"
[23] "IPSC_Line" "DiseaseStatus"
[25] "integrated_snn_res.0" "integrated_snn_res.0.3"
[27] "integrated_snn_res.0.6" "integrated_snn_res.1"
[29] "Celltypes1" "Celltypes2"
[31] "Celltypes3"
dim(sum_counts$RNA)
[1] 58395 12
sum_counts_df <- as.data.frame(sum_counts$RNA)
dim(sum_counts_df)
[1] 58395 12
Try to make the MDS plots and the DESeq2 DGE
library( "DESeq2" )
Warning: package ‘DESeq2’ was built under R version 4.2.2
Loading required package: S4Vectors
Warning: package ‘S4Vectors’ was built under R version 4.2.2
Loading required package: stats4
Loading required package: BiocGenerics
Attaching package: ‘BiocGenerics’
The following objects are masked from ‘package:dplyr’:
combine, intersect, setdiff, union
The following objects are masked from ‘package:stats’:
IQR, mad, sd, var, xtabs
The following objects are masked from ‘package:base’:
anyDuplicated, aperm, append, as.data.frame, basename, cbind,
colnames, dirname, do.call, duplicated, eval, evalq, Filter,
Find, get, grep, grepl, intersect, is.unsorted, lapply, Map,
mapply, match, mget, order, paste, pmax, pmax.int, pmin,
pmin.int, Position, rank, rbind, Reduce, rownames, sapply,
setdiff, sort, table, tapply, union, unique, unsplit,
which.max, which.min
Attaching package: ‘S4Vectors’
The following objects are masked from ‘package:dplyr’:
first, rename
The following object is masked from ‘package:tidyr’:
expand
The following objects are masked from ‘package:base’:
expand.grid, I, unname
Loading required package: IRanges
Attaching package: ‘IRanges’
The following objects are masked from ‘package:dplyr’:
collapse, desc, slice
The following object is masked from ‘package:purrr’:
reduce
Loading required package: GenomicRanges
Warning: package ‘GenomicRanges’ was built under R version 4.2.2
Loading required package: GenomeInfoDb
Warning: package ‘GenomeInfoDb’ was built under R version 4.2.2
Loading required package: SummarizedExperiment
Loading required package: MatrixGenerics
Loading required package: matrixStats
Attaching package: ‘matrixStats’
The following object is masked from ‘package:dplyr’:
count
Attaching package: ‘MatrixGenerics’
The following objects are masked from ‘package:matrixStats’:
colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
colMads, colMaxs, colMeans2, colMedians, colMins,
colOrderStats, colProds, colQuantiles, colRanges, colRanks,
colSdDiffs, colSds, colSums2, colTabulates, colVarDiffs,
colVars, colWeightedMads, colWeightedMeans,
colWeightedMedians, colWeightedSds, colWeightedVars, rowAlls,
rowAnyNAs, rowAnys, rowAvgsPerColSet, rowCollapse, rowCounts,
rowCummaxs, rowCummins, rowCumprods, rowCumsums, rowDiffs,
rowIQRDiffs, rowIQRs, rowLogSumExps, rowMadDiffs, rowMads,
rowMaxs, rowMeans2, rowMedians, rowMins, rowOrderStats,
rowProds, rowQuantiles, rowRanges, rowRanks, rowSdDiffs,
rowSds, rowSums2, rowTabulates, rowVarDiffs, rowVars,
rowWeightedMads, rowWeightedMeans, rowWeightedMedians,
rowWeightedSds, rowWeightedVars
Loading required package: Biobase
Welcome to Bioconductor
Vignettes contain introductory material; view with
'browseVignettes()'. To cite Bioconductor, see
'citation("Biobase")', and for packages
'citation("pkgname")'.
Attaching package: ‘Biobase’
The following object is masked from ‘package:MatrixGenerics’:
rowMedians
The following objects are masked from ‘package:matrixStats’:
anyMissing, rowMedians
Attaching package: ‘SummarizedExperiment’
The following object is masked from ‘package:SeuratObject’:
Assays
The following object is masked from ‘package:Seurat’:
Assays
library(ggplot2)
Read back in the aggregated values (grouped by sample for all clusters)
I need to prepare the data for DESeq2
t.df <- t(df.all)
t.df[1:3,1:5]
TSPAN6 TNMD DPM1 SCYL3 C1orf112
B1_TD07_PD 226.8531 3.510116 539.7336 354.7319 599.9776
B1_TD22_Con 215.2970 1.233656 510.8297 298.1682 422.9308
B1_x2965_PD 398.1948 4.792417 1022.1249 440.1952 1009.8998
Now add the meta data
Prepare objects for DESeq2
library( "DESeq2" )
Warning: package ‘DESeq2’ was built under R version 4.2.2
Loading required package: S4Vectors
Warning: package ‘S4Vectors’ was built under R version 4.2.2
Loading required package: stats4
Loading required package: BiocGenerics
Attaching package: ‘BiocGenerics’
The following objects are masked from ‘package:dplyr’:
combine, intersect, setdiff, union
The following objects are masked from ‘package:stats’:
IQR, mad, sd, var, xtabs
The following objects are masked from ‘package:base’:
anyDuplicated, aperm, append, as.data.frame, basename, cbind,
colnames, dirname, do.call, duplicated, eval, evalq, Filter,
Find, get, grep, grepl, intersect, is.unsorted, lapply, Map,
mapply, match, mget, order, paste, pmax, pmax.int, pmin,
pmin.int, Position, rank, rbind, Reduce, rownames, sapply,
setdiff, sort, table, tapply, union, unique, unsplit, which.max,
which.min
Attaching package: ‘S4Vectors’
The following object is masked from ‘package:tidyr’:
expand
The following objects are masked from ‘package:dplyr’:
first, rename
The following objects are masked from ‘package:base’:
expand.grid, I, unname
Loading required package: IRanges
Attaching package: ‘IRanges’
The following object is masked from ‘package:purrr’:
reduce
The following objects are masked from ‘package:dplyr’:
collapse, desc, slice
Loading required package: GenomicRanges
Warning: package ‘GenomicRanges’ was built under R version 4.2.2
Loading required package: GenomeInfoDb
Warning: package ‘GenomeInfoDb’ was built under R version 4.2.2
Loading required package: SummarizedExperiment
Loading required package: MatrixGenerics
Loading required package: matrixStats
Attaching package: ‘matrixStats’
The following object is masked from ‘package:dplyr’:
count
Attaching package: ‘MatrixGenerics’
The following objects are masked from ‘package:matrixStats’:
colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
colWeightedMeans, colWeightedMedians, colWeightedSds,
colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
rowOrderStats, rowProds, rowQuantiles, rowRanges, rowRanks,
rowSdDiffs, rowSds, rowSums2, rowTabulates, rowVarDiffs,
rowVars, rowWeightedMads, rowWeightedMeans, rowWeightedMedians,
rowWeightedSds, rowWeightedVars
Loading required package: Biobase
Welcome to Bioconductor
Vignettes contain introductory material; view with
'browseVignettes()'. To cite Bioconductor, see
'citation("Biobase")', and for packages 'citation("pkgname")'.
Attaching package: ‘Biobase’
The following object is masked from ‘package:MatrixGenerics’:
rowMedians
The following objects are masked from ‘package:matrixStats’:
anyMissing, rowMedians
# the input is a dataframe with only expression values no meta data
# the dataframe gets transposed back so the initial DF all is correct
dft <- df.all # columns are samples and genes are rows
#prepare the dds object
# we need the transposed data frame no meta data
# DESeq2 requires integer counts; as.integer() TRUNCATES the aggregated values
# toward zero rather than rounding -- negligible for large counts, but note the
# small downward bias. lapply() returns a list of columns, rebuilt just below.
dfi <- lapply(dft, as.integer)
dfi <- as.data.frame(dfi)
rownames(dfi) <- rownames(dft) # lapply drops rownames; restore the gene names
# here we need the meta data
# NOTE(review): 'sample' is presumably a sample-level metadata data.frame built
# earlier (it shadows base::sample here); confirm its rows align with the
# columns of dfi -- DESeqDataSetFromMatrix assumes positional correspondence.
df.meta <- sample
# and we need what variable to compare
dds <- DESeqDataSetFromMatrix(countData = dfi, colData = df.meta, design = ~DiseaseStatus)
Warning in DESeqDataSet(se, design = design, ignoreRank) :
some variables in design formula are characters, converting to factors
# I ran this originally with all data
# see the object
dds
class: DESeqDataSet
dim: 58395 12
metadata(1): version
assays(1): counts
rownames(58395): TSPAN6 TNMD ... AP000646.1 AP006216.3
rowData names(0):
colnames(12): B1_TD07_PD B1_TD22_Con ... B3_x2965_PD B3_x3448_Con
colData names(4): sample Batch Line DiseaseStatus
Now we run the DESEQ function
dds <- DESeq(dds)
estimating size factors
estimating dispersions
gene-wise dispersion estimates
mean-dispersion relationship
final dispersion estimates
fitting model and testing
Look at the results
res <- results(dds)
head(results(dds, tidy= TRUE))
NA
summary(res)
out of 36221 with nonzero total read count
adjusted p-value < 0.1
LFC > 0 (up) : 351, 0.97%
LFC < 0 (down) : 567, 1.6%
outliers [1] : 156, 0.43%
low counts [2] : 12228, 34%
(mean count < 2)
[1] see 'cooksCutoff' argument of ?results
[2] see 'independentFiltering' argument of ?results
library(EnhancedVolcano)
# Volcano of the all-sample pseudobulk DESeq2 results (PD vs Con), using the
# raw pvalue column on the y axis
EnhancedVolcano(res,
lab = rownames(res),
x = 'log2FoldChange',
y = 'pvalue',
title = 'All cell types NPCs',
pCutoff = 0.001,
FCcutoff = 2.5,
pointSize = 5.0,
labSize = 5,
legendLabSize = 15,
legendIconSize = 2)
# NOTE(review): vsdata is not created until the varianceStabilizingTransformation
# call much later in the notebook -- these PCA calls presumably ran in a session
# where it already existed; confirm the intended ordering.
plotPCA(vsdata, intgroup="DiseaseStatus")
plotPCA(vsdata, intgroup="Line")
plotPCA(vsdata, intgroup="Batch")
plotPCA(vsdata, intgroup="sample")
DESeq2 analysis from the cell type separate aggregated data
# Drop the "X" gene-name column that read.csv added so only count columns remain
df <- df %>% select(-"X")
colnames(df)
[1] "NPC_TD07B1" "NPC_TD07B2" "NPC_TD07B3"
[4] "NPC_TD22B1" "NPC_TD22B2" "NPC_TD22B3"
[7] "NPC_x2965B1" "NPC_x2965B2" "NPC_x2965B3"
[10] "NPC_x3448B1" "NPC_x3448B2" "NPC_x3448B3"
[13] "Neurons_TD07B1" "Neurons_TD07B2" "Neurons_TD07B3"
[16] "Neurons_TD22B1" "Neurons_TD22B2" "Neurons_TD22B3"
[19] "Neurons_x2965B1" "Neurons_x2965B2" "Neurons_x2965B3"
[22] "Neurons_x3448B1" "Neurons_x3448B2" "Neurons_x3448B3"
[25] "NPC.div_TD07B1" "NPC.div_TD07B2" "NPC.div_TD07B3"
[28] "NPC.div_TD22B1" "NPC.div_TD22B2" "NPC.div_TD22B3"
[31] "NPC.div_x2965B1" "NPC.div_x2965B2" "NPC.div_x2965B3"
[34] "NPC.div_x3448B1" "NPC.div_x3448B2" "NPC.div_x3448B3"
[37] "Neuro.NPC_TD07B1" "Neuro.NPC_TD07B2" "Neuro.NPC_TD07B3"
[40] "Neuro.NPC_TD22B1" "Neuro.NPC_TD22B2" "Neuro.NPC_TD22B3"
[43] "Neuro.NPC_x2965B1" "Neuro.NPC_x2965B2" "Neuro.NPC_x2965B3"
[46] "Neuro.NPC_x3448B1" "Neuro.NPC_x3448B2" "Neuro.NPC_x3448B3"
[49] "Neural.Stem_TD07B1" "Neural.Stem_TD07B2" "Neural.Stem_TD07B3"
[52] "Neural.Stem_TD22B1" "Neural.Stem_TD22B2" "Neural.Stem_TD22B3"
[55] "Neural.Stem_x2965B1" "Neural.Stem_x2965B2" "Neural.Stem_x2965B3"
[58] "Neural.Stem_x3448B1" "Neural.Stem_x3448B2" "Neural.Stem_x3448B3"
[61] "Stem_TD07B1" "Stem_TD07B2" "Stem_TD07B3"
[64] "Stem_TD22B1" "Stem_TD22B2" "Stem_TD22B3"
[67] "Stem_x2965B1" "Stem_x2965B2" "Stem_x2965B3"
[70] "Stem_x3448B2" "Neural.epi_TD22B3" "Neural.epi_x3448B1"
[73] "Neural.epi_x3448B2" "Neural.epi_x3448B3"
Make the metadata
Now set up for the DEG
dds <- DESeqDataSetFromMatrix(countData = dfi, colData = df.meta, design = ~DiseaseStatus)
Warning in DESeqDataSet(se, design = design, ignoreRank) :
some variables in design formula are characters, converting to factors
# see the object
dds
class: DESeqDataSet
dim: 58395 74
metadata(1): version
assays(1): counts
rownames(58395): TSPAN6 TNMD ... AP000646.1 AP006216.3
rowData names(0):
colnames(74): NPC_TD07B1 NPC_TD07B2 ... Neural.epi_x3448B2
Neural.epi_x3448B3
colData names(5): Sample Celltype Line Batch DiseaseStatus
# run DeqSeq2
dds <- DESeq(dds)
estimating size factors
estimating dispersions
gene-wise dispersion estimates
mean-dispersion relationship
final dispersion estimates
fitting model and testing
-- replacing outliers and refitting for 373 genes
-- DESeq argument 'minReplicatesForReplace' = 7
-- original counts are preserved in counts(dds)
estimating dispersions
fitting model and testing
# see results
res <- results(dds)
head(results(dds, tidy= TRUE))
summary(res)
out of 36120 with nonzero total read count
adjusted p-value < 0.1
LFC > 0 (up) : 727, 2%
LFC < 0 (down) : 1254, 3.5%
outliers [1] : 0, 0%
low counts [2] : 16262, 45%
(mean count < 0)
[1] see 'cooksCutoff' argument of ?results
[2] see 'independentFiltering' argument of ?results
PCA plots
# Variance-stabilised transform of the per-celltype aggregate for PCA QC;
# one panel per covariate of interest
vsdata <- varianceStabilizingTransformation(dds)
plotPCA(vsdata, intgroup="DiseaseStatus")
plotPCA(vsdata, intgroup="Line")
plotPCA(vsdata, intgroup="Batch")
plotPCA(vsdata, intgroup="Sample")
plotPCA(vsdata, intgroup = "Celltype")
Subset full dataframe by cell type
# Assuming df.trans is your transposed dataframe and df.meta is your metadata dataframe
# Load necessary library
#library(DESeq2)
# Extract unique cell types from the "Celltype" column in df.meta
Celltypes <- unique(df.meta$Celltype)
print(Celltypes)
[1] "NPC" "Neurons" "NPC.div" "Neuro.NPC" "Neural.Stem"
[6] "Stem" "Neural.epi"
# Create an empty list to store DESeq results for each cell type
list.results <- list()
# samples (rows) x genes (cols); rownames follow "<Celltype>_<Line><Batch>"
df.trans <- as.data.frame(t(df))
dim(df.trans)
[1] 74 58395
#df.trans[1:3,1:5]
#dim(df.meta)
# Loop through each cell type and perform DESeq analysis
# Per-celltype DESeq2 analysis (PD vs Con). The ~DiseaseStatus design needs
# both levels present: Neural.epi has only one (all 4 of its samples), which
# previously crashed this loop inside DESeqDataSet ("design has a single
# variable, with all samples having the same value") -- such cell types are
# now skipped with a message so the remaining ones still run.
for (i in Celltypes) {
  print(i)
  # Subset the samples-x-genes dataframe to this cell type's rows
  df_sub <- df.trans[grepl(paste0("^",i,"_"), rownames(df.trans)), ]
  print(dim(df_sub))
  df.meta_sub <- df.meta[df.meta$Celltype == i, ]
  print(dim(df.meta_sub))
  # Need at least two DiseaseStatus levels to fit ~DiseaseStatus
  if (length(unique(df.meta_sub$DiseaseStatus)) < 2) {
    message("Skipping ", i, ": only one DiseaseStatus level present")
    next
  }
  # Transpose back to genes x samples and coerce to integer counts for DESeq2
  dft <- as.data.frame(t(df_sub))
  dfi <- lapply(dft, as.integer)
  dfi <- as.data.frame(dfi)
  rownames(dfi) <- rownames(dft)
  # Build and fit the DESeq2 model for this cell type
  dds <- DESeqDataSetFromMatrix(countData = dfi, colData = df.meta_sub, design = ~DiseaseStatus)
  dds <- DESeq(dds)
  # Store the fitted object and its results, keyed by cell type
  res <- results(dds)
  list.results[[i]] <- list(dds = dds, results = res)
}
[1] "NPC"
[1] 12 58395
[1] 12 5
Warning in DESeqDataSet(se, design = design, ignoreRank) :
some variables in design formula are characters, converting to factors
estimating size factors
estimating dispersions
gene-wise dispersion estimates
mean-dispersion relationship
final dispersion estimates
fitting model and testing
[1] "Neurons"
[1] 12 58395
[1] 12 5
Warning in DESeqDataSet(se, design = design, ignoreRank) :
some variables in design formula are characters, converting to factors
estimating size factors
estimating dispersions
gene-wise dispersion estimates
mean-dispersion relationship
final dispersion estimates
fitting model and testing
[1] "NPC.div"
[1] 12 58395
[1] 12 5
Warning in DESeqDataSet(se, design = design, ignoreRank) :
some variables in design formula are characters, converting to factors
estimating size factors
estimating dispersions
gene-wise dispersion estimates
mean-dispersion relationship
final dispersion estimates
fitting model and testing
[1] "Neuro.NPC"
[1] 12 58395
[1] 12 5
Warning in DESeqDataSet(se, design = design, ignoreRank) :
some variables in design formula are characters, converting to factors
estimating size factors
estimating dispersions
gene-wise dispersion estimates
mean-dispersion relationship
final dispersion estimates
fitting model and testing
[1] "Neural.Stem"
[1] 12 58395
[1] 12 5
Warning in DESeqDataSet(se, design = design, ignoreRank) :
some variables in design formula are characters, converting to factors
estimating size factors
estimating dispersions
gene-wise dispersion estimates
mean-dispersion relationship
final dispersion estimates
fitting model and testing
[1] "Stem"
[1] 10 58395
[1] 10 5
Warning in DESeqDataSet(se, design = design, ignoreRank) :
some variables in design formula are characters, converting to factors
estimating size factors
estimating dispersions
gene-wise dispersion estimates
mean-dispersion relationship
final dispersion estimates
fitting model and testing
[1] "Neural.epi"
[1] 4 58395
[1] 4 5
Warning in DESeqDataSet(se, design = design, ignoreRank) :
some variables in design formula are characters, converting to factors
Error in DESeqDataSet(se, design = design, ignoreRank) :
design has a single variable, with all samples having the same value.
use instead a design of '~ 1'. estimateSizeFactors, rlog and the VST can then be used
A look at the results
res <- results(dds)
head(results(dds, tidy= TRUE))
summary(res)
out of 34278 with nonzero total read count
adjusted p-value < 0.1
LFC > 0 (up) : 266, 0.78%
LFC < 0 (down) : 469, 1.4%
outliers [1] : 166, 0.48%
low counts [2] : 12780, 37%
(mean count < 2)
[1] see 'cooksCutoff' argument of ?results
[2] see 'independentFiltering' argument of ?results
# Pull the fitted NPC model back out of the results list for QC plots
dds <- list.results$NPC$dds
vsdata <- varianceStabilizingTransformation(dds)
plotPCA(vsdata, intgroup="DiseaseStatus")
plotPCA(vsdata, intgroup="Line")
plotPCA(vsdata, intgroup="Batch")
plotPCA(vsdata, intgroup="Sample")
# NOTE(review): within the NPC subset every sample shares the same Celltype,
# so this grouping is uninformative -- confirm it is intentional.
plotPCA(vsdata, intgroup = "Celltype")
library(EnhancedVolcano)
# At this point 'res' still holds leftovers from the per-celltype loop above
# (the last cell type that ran, not NPC), so fetch the NPC results explicitly
# before plotting the NPC-titled volcano.
res <- list.results$NPC$results
EnhancedVolcano(res,
                lab = rownames(res),
                x = 'log2FoldChange',
                y = 'pvalue',
                title = 'NPC',
                ylim = c(0,20),
                pCutoff = 0.01,
                FCcutoff = 2.5,
                pointSize = 5.0,
                labSize = 5,
                legendLabSize = 15,
                legendIconSize = 5)
Get the DGE dataframe
Create a summary dataframe from the DGE
# Tidy summary of the Neurons pseudobulk DGE with cell type and gene columns
res1 <- as.data.frame(list.results$Neurons$results)
res1$Celltype <- "Neurons"
res1$Gene <- rownames(res1)
# NOTE(review): filters on the raw pvalue rather than padj -- padj is retained
# in the output so both can be inspected; confirm which cutoff is intended.
res.sig <- (res1 %>% filter(pvalue <= 0.05)) %>% select(c("Gene","Celltype","log2FoldChange","pvalue","padj"))
res.sig
NA
NA
NA
Some DGE analysis for NPC to compare
library(enrichR)
Welcome to enrichR
Checking connection ...
Enrichr ... Connection is Live!
FlyEnrichr ... Connection is available!
WormEnrichr ... Connection is available!
YeastEnrichr ... Connection is available!
FishEnrichr ... Connection is available!
OxEnrichr ... Connection is available!
setEnrichrSite("Enrichr") # Human genes
Connection changed to https://maayanlab.cloud/Enrichr/
Connection is Live!
# list of all the databases
# get the possible libraries
dbs <- listEnrichrDbs()
# this will list the possible libraries
dbs
# select libraries with cell types
db <- c('KEGG_2019_Human','GWAS_Catalog_2019',"GO_Biological_Process_2023",
"GO_Cellular_Component_2023","GO_Molecular_Function_2023")
# function for a quick look - the names for filtering need to match the column names
# Run an EnrichR gene-set analysis on a DESeq2 results dataframe.
#   dataframe:   DESeq2 results with columns log2FoldChange, padj and gene names
#                in column X (as produced when results are saved to / re-read
#                from csv)
#   up_or_down:  "up" (LFC >= |threshold|), "down" (LFC <= -|threshold|),
#                anything else = both directions (padj filter only)
#   LFCthresh:   log2 fold-change magnitude; the sign is normalised internally,
#                so "down" with 0.02 or -0.02 selects the same genes (previously
#                a positive threshold passed to "down" also caught mildly
#                up-regulated genes)
#   pval_thresh: adjusted p-value cutoff
# Prints one plotEnrich() panel per library in the global db and returns the
# enrichr() result list.
getGSA <- function(dataframe, up_or_down = "up", LFCthresh = 0.01,
                   pval_thresh = 0.05){
  if(up_or_down == "up"){
    genelist <- dataframe %>% filter(log2FoldChange >= abs(LFCthresh) & padj < pval_thresh)
  }else if (up_or_down == "down") {
    genelist <- dataframe %>% filter(log2FoldChange <= -abs(LFCthresh) & padj < pval_thresh)
  } else {
    genelist <- dataframe %>% filter(padj < pval_thresh)
  }
  genes <- genelist$X
  # query all selected libraries in one request
  results <- enrichr(genes, databases = db)
  # visualize the results, one panel per library
  print(plotEnrich(results[[1]], showTerms = 20, numChar = 40, y = "Count", orderBy = "P.value", title = 'KEGG'))
  print(plotEnrich(results[[2]], showTerms = 20, numChar = 40, y = "Count", orderBy = "P.value", title = 'GWAS'))
  print(plotEnrich(results[[3]], showTerms = 20, numChar = 40, y = "Count", orderBy = "P.value", title = 'GObio'))
  print(plotEnrich(results[[4]], showTerms = 20, numChar = 40, y = "Count", orderBy = "P.value", title = 'GOcell'))
  print(plotEnrich(results[[5]], showTerms = 20, numChar = 40, y = "Count", orderBy = "P.value", title = 'GOmol'))
  return(results)
}
# use only the significant after p adjusted genes
# add genes column to DGE it is X if saved to csv
padj$X <- rownames(padj)
pseudoDGEup <- getGSA(padj, up_or_down = "up",
LFCthresh = 0.02,
pval_thresh = 0.05)
Uploading data to Enrichr... Done.
Querying KEGG_2019_Human... Done.
Querying GWAS_Catalog_2019... Done.
Querying GO_Biological_Process_2023... Done.
Querying GO_Cellular_Component_2023... Done.
Querying GO_Molecular_Function_2023... Done.
Parsing results... Done.
pseudoDGEdown <- getGSA(padj, up_or_down = "down",
LFCthresh = -0.02,
pval_thresh = 0.05)
Uploading data to Enrichr... Done.
Querying KEGG_2019_Human... Done.
Querying GWAS_Catalog_2019... Done.
Querying GO_Biological_Process_2023... Done.
Querying GO_Cellular_Component_2023... Done.
Querying GO_Molecular_Function_2023... Done.
Parsing results... Done.
# Both directions combined; stored under its own name so the down-regulated
# results from the previous call are not overwritten (the original re-used
# 'pseudoDGEdown' here, clobbering them).
pseudoDGEboth <- getGSA(padj, up_or_down = "both",
LFCthresh = 0,
pval_thresh = 0.05)
Uploading data to Enrichr... Done.
Querying KEGG_2019_Human... Done.
Querying GWAS_Catalog_2019... Done.
Querying GO_Biological_Process_2023... Done.
Querying GO_Cellular_Component_2023... Done.
Querying GO_Molecular_Function_2023... Done.
Parsing results... Done.
Compare genes from the Wilcoxon rank-sum test versus pseudobulk
# Significant genes from each method: single-cell Wilcoxon DGE vs pseudobulk
sc <- scDGE.NPC %>% filter(p_val_adj <= 0.05)
pb <- res1 %>% filter(padj <= 0.05)
# NOTE(review): sc$X assumes scDGE.NPC was re-read from csv (gene names in a
# column named X); confirm the column exists.
contrast.list <- list(A= sc$X, B= pb$Gene)
# plot the VENN
names(contrast.list) <- c("single cells","pseudobulk")
ggvenn(
contrast.list,
fill_color = c("#0073C2FF", "#EFC000FF", "#868686FF", "#CD534CFF"),
stroke_size = 0.75, set_name_size = 5, show_percentage = FALSE, text_size = 5, fill_alpha = 0.75
)
Warning: There was 1 warning in `mutate()`.
ℹ In argument: `text = sprintf("%d", n, 100 * n/sum(n))`.
Caused by warning in `sprintf()`:
! one argument not used by format '%d'
Look for the overlapping genes
# NOTE(review): B and D are not defined in this chunk (contrast.list above used
# names "A"/"B") -- presumably down-regulated gene vectors from an earlier
# session; confirm where they come from before re-running.
down.overlap <- intersect(B,D)
down.overlap
[1] "GJA1" "RBM47" "LRP1B" "HERC2P3" "AC105402.3"
[6] "PURPL" "AC105411.1" "BCL11A" "GPC3" "EMX2OS"
[11] "HOXB3" "AC104041.1" "GCNT1" "AL035425.2" "HOXD3"
[16] "WNT8B" "AC023794.1" "HOXB9" "MIR34AHG" "HOXA3"
[21] "HOXC6" "ARL17B" "FEZF1-AS1" "COL14A1" "ZMAT3"
[26] "MECOM" "PCSK5" "IRS4" "KCNMB2-AS1" "HOXC4"
[31] "MIR9-3HG" "HOXB6" "WNT7B" "MIR646HG" "NR3C2"
[36] "MDM2" "TTC29" "LINC02334" "EGF" "KIRREL1"
[41] "APP" "PCDH7" "BCL11B" "PCAT1" "LINC02253"
[46] "RFTN1" "ADAM23" "CDH4" "ADCY2" "TVP23C"
[51] "MIR181A1HG" "PAM" "DCDC1" "ADAMTS19" "PLD5"
[56] "CDH20" "PGAP1" "PRKCE" "SLCO3A1" "OPHN1"
[61] "ROR2" "GRID2"
Check DGE between cell lines
# Parallel results list for the per-line (rather than per-disease) contrasts
list.results.lines <- list()
# samples (rows) x genes (cols), same layout as the disease-status loop
df.trans <- as.data.frame(t(df))
dim(df.trans)
[1] 74 58395
#df.trans[1:3,1:5]
#dim(df.meta)
# Loop through each cell type and perform DESeq analysis
# For each cell type: subset the pseudobulk counts, build a DESeq2 data set
# modelling expression by cell line, fit it, and store both the fitted
# object and the default results table under the cell-type name.
for (i in Celltypes) {
  print(i)
  # Rows whose sample name starts with "<celltype>_" belong to this cell type
  df_sub <- df.trans[grepl(paste0("^", i, "_"), rownames(df.trans)), ]
  print(dim(df_sub))
  df.meta_sub <- df.meta[df.meta$Celltype == i, ]
  print(dim(df.meta_sub))
  # Back to genes-as-rows, then coerce every column to integer counts
  # (DESeq2 requires an integer count matrix)
  dft <- as.data.frame(t(df_sub))
  dfi <- as.data.frame(lapply(dft, as.integer))
  rownames(dfi) <- rownames(dft)
  # Fit the DESeq2 model with cell line as the design variable
  dds <- DESeqDataSetFromMatrix(countData = dfi, colData = df.meta_sub, design = ~Line)
  dds <- DESeq(dds)
  res <- results(dds)
  # Store fitted object and results under the same names as before
  list.results.lines[[i]] <- list(dds = dds, results = res)
}
[1] "NPC"
[1] 12 58395
[1] 12 5
Warning in DESeqDataSet(se, design = design, ignoreRank) :
some variables in design formula are characters, converting to factors
estimating size factors
estimating dispersions
gene-wise dispersion estimates
mean-dispersion relationship
final dispersion estimates
fitting model and testing
[1] "Neurons"
[1] 12 58395
[1] 12 5
Warning in DESeqDataSet(se, design = design, ignoreRank) :
some variables in design formula are characters, converting to factors
estimating size factors
estimating dispersions
gene-wise dispersion estimates
mean-dispersion relationship
final dispersion estimates
fitting model and testing
[1] "NPC.div"
[1] 12 58395
[1] 12 5
Warning in DESeqDataSet(se, design = design, ignoreRank) :
some variables in design formula are characters, converting to factors
estimating size factors
estimating dispersions
gene-wise dispersion estimates
mean-dispersion relationship
final dispersion estimates
fitting model and testing
[1] "Neuro.NPC"
[1] 12 58395
[1] 12 5
Warning in DESeqDataSet(se, design = design, ignoreRank) :
some variables in design formula are characters, converting to factors
estimating size factors
estimating dispersions
gene-wise dispersion estimates
mean-dispersion relationship
final dispersion estimates
fitting model and testing
[1] "Neural.Stem"
[1] 12 58395
[1] 12 5
Warning in DESeqDataSet(se, design = design, ignoreRank) :
some variables in design formula are characters, converting to factors
estimating size factors
estimating dispersions
gene-wise dispersion estimates
mean-dispersion relationship
final dispersion estimates
fitting model and testing
[1] "Stem"
[1] 10 58395
[1] 10 5
Warning in DESeqDataSet(se, design = design, ignoreRank) :
some variables in design formula are characters, converting to factors
estimating size factors
estimating dispersions
gene-wise dispersion estimates
mean-dispersion relationship
final dispersion estimates
fitting model and testing
[1] "Neural.epi"
[1] 4 58395
[1] 4 5
Warning in DESeqDataSet(se, design = design, ignoreRank) :
some variables in design formula are characters, converting to factors
estimating size factors
estimating dispersions
gene-wise dispersion estimates
mean-dispersion relationship
final dispersion estimates
fitting model and testing
summary(list.results.lines$NPC$results)
out of 34278 with nonzero total read count
adjusted p-value < 0.1
LFC > 0 (up) : 1075, 3.1%
LFC < 0 (down) : 940, 2.7%
outliers [1] : 95, 0.28%
low counts [2] : 13439, 39%
(mean count < 3)
[1] see 'cooksCutoff' argument of ?results
[2] see 'independentFiltering' argument of ?results
head(list.results.lines$NPC$results)
log2 fold change (MLE): Line x3448B vs TD07B
Wald test p-value: Line x3448B vs TD07B
DataFrame with 6 rows and 6 columns
baseMean log2FoldChange lfcSE stat pvalue padj
<numeric> <numeric> <numeric> <numeric> <numeric> <numeric>
TSPAN6 252.946742 0.77507623 0.290873 2.6646566 0.0077067 0.0856281
TNMD 8.869491 4.96264208 1.739830 2.8523718 NA NA
DPM1 382.043787 -0.00542539 0.150453 -0.0360603 0.9712342 0.9949323
SCYL3 221.859838 0.02439490 0.175854 0.1387225 0.8896695 0.9748766
C1orf112 599.106196 -0.10184738 0.272472 -0.3737899 0.7085606 0.9170440
FGR 0.344874 -0.34434998 4.407356 -0.0781307 0.9377241 NA
To see each contrast:
colnames(x2965_vs_x3448_results)
[1] "baseMean" "log2FoldChange" "lfcSE" "stat"
[5] "pvalue" "padj"
# Initialize an empty list to store the results
all_results <- list()
# Loop over each unordered pair of lines and compute the DESeq2 contrast.
# BUG FIX: the previous version iterated `for (j in (i+1):length(...))` with
# i running to length(all_contrasts); when i == length, `(i+1):length` counts
# BACKWARDS (e.g. 5:4), so j eventually equalled i and results() was asked to
# contrast a level against itself ("x3448B and x3448B should be different").
# Bounding the outer index at length - 1 guarantees i < j for every pair.
if (length(all_contrasts) >= 2) {
  for (i in seq_len(length(all_contrasts) - 1)) {
    for (j in seq(i + 1, length(all_contrasts))) {
      contrast_level1 <- all_contrasts[i]
      contrast_level2 <- all_contrasts[j]
      # Only test pairs where both levels are present in this data set
      # (scalar && short-circuits, unlike the elementwise & used before)
      if (contrast_level1 %in% dds$Line && contrast_level2 %in% dds$Line) {
        contrast_name <- paste("Line", contrast_level1, "vs", contrast_level2)
        all_results[[contrast_name]] <- results(
          dds,
          contrast = c("Line", contrast_level1, contrast_level2),
          name = contrast_name
        )
      }
    }
  }
}
Error in checkContrast(contrast, resNames) :
x3448B and x3448B should be different level names
Look at other contrasts instead of PD vs HC: mixed groupings of (HC + PD) vs (HC + PD), and the reverse combination
# Loop through each cell type and test three alternative sample groupings
# (DiseaseStatus, Mix1, Mix2), storing the fitted object and results for
# each under "dds.<design>" / "results.<design>" as before.
# FIXES:
#  1. `list2` is now re-created inside the loop. It was previously created
#     once before the loop, so a failure mid-iteration could leave a cell
#     type holding stale entries copied from the previous cell type.
#  2. Each design fit is wrapped in tryCatch: a grouping with a single level
#     (e.g. Neural.epi, where every sample had the same DiseaseStatus)
#     previously aborted the entire loop with
#     "design has a single variable, with all samples having the same value".
#     Now that design is skipped with a message and the loop continues.
for (i in Celltypes) {
  print(i)
  # Subset the expression dataframe by the current cell type
  df_sub <- df.trans[grepl(paste0("^", i, "_"), rownames(df.trans)), ]
  print(dim(df_sub))
  df.meta_sub <- df.meta[df.meta$Celltype == i, ]
  print(dim(df.meta_sub))
  # Transpose back to genes-as-rows and coerce counts to integers for DESeq2
  dft <- as.data.frame(t(df_sub))
  dfi <- lapply(dft, as.integer)
  dfi <- as.data.frame(dfi)
  rownames(dfi) <- rownames(dft)
  # Fresh results container for this cell type
  list2 <- list()
  for (design.var in c("DiseaseStatus", "Mix1", "Mix2")) {
    tryCatch({
      dds <- DESeqDataSetFromMatrix(countData = dfi, colData = df.meta_sub,
                                    design = as.formula(paste0("~", design.var)))
      dds <- DESeq(dds)
      res <- results(dds)
      list2[[paste0("dds.", design.var)]] <- dds
      list2[[paste0("results.", design.var)]] <- res
    }, error = function(e) {
      message("Skipping ", i, " / ", design.var, ": ", conditionMessage(e))
    })
  }
  list.results.lines[[i]] <- list2
}
[1] "NPC"
[1] 12 58395
[1] 12 7
Warning in DESeqDataSet(se, design = design, ignoreRank) :
some variables in design formula are characters, converting to factors
estimating size factors
estimating dispersions
gene-wise dispersion estimates
mean-dispersion relationship
final dispersion estimates
fitting model and testing
Warning in DESeqDataSet(se, design = design, ignoreRank) :
some variables in design formula are characters, converting to factors
estimating size factors
estimating dispersions
gene-wise dispersion estimates
mean-dispersion relationship
final dispersion estimates
fitting model and testing
Warning in DESeqDataSet(se, design = design, ignoreRank) :
some variables in design formula are characters, converting to factors
estimating size factors
estimating dispersions
gene-wise dispersion estimates
mean-dispersion relationship
final dispersion estimates
fitting model and testing
[1] "Neurons"
[1] 12 58395
[1] 12 7
Warning in DESeqDataSet(se, design = design, ignoreRank) :
some variables in design formula are characters, converting to factors
estimating size factors
estimating dispersions
gene-wise dispersion estimates
mean-dispersion relationship
final dispersion estimates
fitting model and testing
Warning in DESeqDataSet(se, design = design, ignoreRank) :
some variables in design formula are characters, converting to factors
estimating size factors
estimating dispersions
gene-wise dispersion estimates
mean-dispersion relationship
final dispersion estimates
fitting model and testing
Warning in DESeqDataSet(se, design = design, ignoreRank) :
some variables in design formula are characters, converting to factors
estimating size factors
estimating dispersions
gene-wise dispersion estimates
mean-dispersion relationship
final dispersion estimates
fitting model and testing
*** recursive gc invocation
*** recursive gc invocation
*** recursive gc invocation
*** recursive gc invocation
*** recursive gc invocation
*** recursive gc invocation
*** recursive gc invocation
[1] "NPC.div"
[1] 12 58395
[1] 12 7
Warning in DESeqDataSet(se, design = design, ignoreRank) :
some variables in design formula are characters, converting to factors
estimating size factors
estimating dispersions
gene-wise dispersion estimates
mean-dispersion relationship
final dispersion estimates
fitting model and testing
Warning in DESeqDataSet(se, design = design, ignoreRank) :
some variables in design formula are characters, converting to factors
estimating size factors
estimating dispersions
gene-wise dispersion estimates
mean-dispersion relationship
final dispersion estimates
fitting model and testing
Warning in DESeqDataSet(se, design = design, ignoreRank) :
some variables in design formula are characters, converting to factors
estimating size factors
estimating dispersions
gene-wise dispersion estimates
mean-dispersion relationship
final dispersion estimates
fitting model and testing
[1] "Neuro.NPC"
[1] 12 58395
[1] 12 7
Warning in DESeqDataSet(se, design = design, ignoreRank) :
some variables in design formula are characters, converting to factors
estimating size factors
estimating dispersions
gene-wise dispersion estimates
mean-dispersion relationship
final dispersion estimates
fitting model and testing
Warning in DESeqDataSet(se, design = design, ignoreRank) :
some variables in design formula are characters, converting to factors
estimating size factors
estimating dispersions
gene-wise dispersion estimates
mean-dispersion relationship
final dispersion estimates
fitting model and testing
Warning in DESeqDataSet(se, design = design, ignoreRank) :
some variables in design formula are characters, converting to factors
estimating size factors
estimating dispersions
gene-wise dispersion estimates
mean-dispersion relationship
final dispersion estimates
fitting model and testing
[1] "Neural.Stem"
[1] 12 58395
[1] 12 7
Warning in DESeqDataSet(se, design = design, ignoreRank) :
some variables in design formula are characters, converting to factors
estimating size factors
estimating dispersions
gene-wise dispersion estimates
mean-dispersion relationship
final dispersion estimates
fitting model and testing
Warning in DESeqDataSet(se, design = design, ignoreRank) :
some variables in design formula are characters, converting to factors
estimating size factors
estimating dispersions
gene-wise dispersion estimates
mean-dispersion relationship
final dispersion estimates
fitting model and testing
Warning in DESeqDataSet(se, design = design, ignoreRank) :
some variables in design formula are characters, converting to factors
estimating size factors
estimating dispersions
gene-wise dispersion estimates
mean-dispersion relationship
final dispersion estimates
fitting model and testing
[1] "Stem"
[1] 10 58395
[1] 10 7
Warning in DESeqDataSet(se, design = design, ignoreRank) :
some variables in design formula are characters, converting to factors
estimating size factors
estimating dispersions
gene-wise dispersion estimates
mean-dispersion relationship
final dispersion estimates
fitting model and testing
Warning in DESeqDataSet(se, design = design, ignoreRank) :
some variables in design formula are characters, converting to factors
estimating size factors
estimating dispersions
gene-wise dispersion estimates
mean-dispersion relationship
final dispersion estimates
fitting model and testing
Warning in DESeqDataSet(se, design = design, ignoreRank) :
some variables in design formula are characters, converting to factors
estimating size factors
estimating dispersions
gene-wise dispersion estimates
mean-dispersion relationship
final dispersion estimates
fitting model and testing
[1] "Neural.epi"
[1] 4 58395
[1] 4 7
Warning in DESeqDataSet(se, design = design, ignoreRank) :
some variables in design formula are characters, converting to factors
Error in DESeqDataSet(se, design = design, ignoreRank) :
design has a single variable, with all samples having the same value.
use instead a design of '~ 1'. estimateSizeFactors, rlog and the VST can then be used
Summarize the results for NPC for the different contrasts
Have a look at the DGE for NPC
summary(res)
out of 34278 with nonzero total read count
adjusted p-value < 0.1
LFC > 0 (up) : 266, 0.78%
LFC < 0 (down) : 469, 1.4%
outliers [1] : 166, 0.48%
low counts [2] : 12780, 37%
(mean count < 2)
[1] see 'cooksCutoff' argument of ?results
[2] see 'independentFiltering' argument of ?results
NPC PD vs Control Volcano
# Put the rownames into a column named X before calling getGSA.
# NOTE(review): presumably getGSA reads gene identifiers from the X column
# (the single-cell results above also carried genes in $X) — confirm
# against the CelltypeR::getGSA definition.
df$X <- rownames(df)
# Enrichr enrichment on both directions with a stricter cutoff (p <= 0.01)
pseudoDGE <- getGSA(df, up_or_down = "both",
LFCthresh = 0,
pval_thresh = 0.01)
Uploading data to Enrichr... Done.
Querying KEGG_2019_Human... Done.
Querying GWAS_Catalog_2019... Done.
Querying GO_Biological_Process_2023... Done.
Querying GO_Cellular_Component_2023... Done.
Querying GO_Molecular_Function_2023... Done.
Parsing results... Done.
GSEA
Between batches - NPC - not complete
i = "NPC"
# Subset the pseudobulk counts to NPC samples only
df_sub <- df.trans[grepl(paste0("^",i,"_"), rownames(df.trans)), ]
print(dim(df_sub))
df.meta_sub <- df.meta[df.meta$Celltype == i, ]
print(dim(df.meta_sub))
# Prepare the DESeq object
dft <- as.data.frame(t(df_sub)) # Transpose the subset dataframe to get genes as rows and samples as columns
dfi <- lapply(dft, as.integer)
dfi <- as.data.frame(dfi)
rownames(dfi) <- rownames(dft)
# Create the DESeqDataSet object using the subset dataframe and metadata,
# modelling expression by sequencing batch
dds <- DESeqDataSetFromMatrix(countData = dfi, colData = df.meta_sub, design = ~Batch)
# BUG FIX: results() requires a fitted model; without this DESeq() call the
# next line errors with "couldn't find results. you should first run DESeq()".
dds <- DESeq(dds)
res <- results(dds)